Bank Marketing dataset - Data Analysis

Data description - Studying the data

Data cleaning

Data exploratory analysis - Data Visualization

Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
import plotly.graph_objects as pl

Loading the dataset

In [2]:
#Getting the current working directory (where pandas will look for relative paths)
os.getcwd()
Out[2]:
'/Users/aishwaryamaddimsetty'
In [3]:
#Changing the working directory location to load the dataset.
# NOTE(review): a hard-coded absolute path makes the notebook non-portable;
# keep it in a single named constant so other users only edit one line.
DATA_DIR = '/Users/aishwaryamaddimsetty/Downloads'
os.chdir(DATA_DIR)
In [4]:
#Importing the dataset (Bank Marketing data) from the working directory set above.
data = pd.read_csv('DSA_DataSet.csv')

Studying the dataset - basic summary statistics, understanding what the dataset comprises etc.

In [5]:
# Peek at the first few rows to get a feel for the columns (head() defaults to 5).
data.head()
Out[5]:
age job marital education default housing loan contact month day_of_week ... pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed ModelPrediction y
0 56 housemaid married basic.4y no no no telephone may mon ... 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 0.932750 no
1 57 services married high.school unknown no no telephone may mon ... 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 0.953579 no
2 37 services married high.school no yes no telephone may mon ... 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 0.945724 no
3 40 admin. married basic.6y no no no telephone may mon ... 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 0.933875 no
4 56 services married high.school no no yes telephone may mon ... 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 0.940996 no

5 rows × 22 columns

In [6]:
#Printing the number of rows and columns of the dataset (rows, columns):
data.shape
Out[6]:
(41188, 22)
In [7]:
#Printing the details about the columns of the dataset (dtypes, non-null counts, memory):
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 22 columns):
age                41188 non-null int64
job                41188 non-null object
marital            41188 non-null object
education          41188 non-null object
default            41188 non-null object
housing            41188 non-null object
loan               41188 non-null object
contact            41188 non-null object
month              41188 non-null object
day_of_week        41188 non-null object
duration           41188 non-null int64
campaign           41188 non-null int64
pdays              41188 non-null int64
previous           41188 non-null int64
poutcome           41188 non-null object
emp.var.rate       41188 non-null float64
cons.price.idx     41188 non-null float64
cons.conf.idx      41188 non-null float64
euribor3m          41188 non-null float64
nr.employed        41188 non-null float64
ModelPrediction    41188 non-null float64
y                  41188 non-null object
dtypes: float64(6), int64(5), object(11)
memory usage: 6.9+ MB

This shows the data has no missing values so no handling required for missing data in data cleaning part

In [8]:
#Printing the datatypes of the columns (object = categorical/text here):
data.dtypes
Out[8]:
age                  int64
job                 object
marital             object
education           object
default             object
housing             object
loan                object
contact             object
month               object
day_of_week         object
duration             int64
campaign             int64
pdays                int64
previous             int64
poutcome            object
emp.var.rate       float64
cons.price.idx     float64
cons.conf.idx      float64
euribor3m          float64
nr.employed        float64
ModelPrediction    float64
y                   object
dtype: object
In [9]:
#Printing the basic summary statistics (numeric columns only) of the dataset:
data.describe()
Out[9]:
age duration campaign pdays previous emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed ModelPrediction
count 41188.00000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000
mean 40.02406 258.285010 2.567593 962.475454 0.172963 0.081886 93.575664 -40.502600 3.621291 5167.035911 0.887597
std 10.42125 259.279249 2.770014 186.910907 0.494901 1.570960 0.578840 4.628198 1.734447 72.251528 0.125351
min 17.00000 0.000000 1.000000 0.000000 0.000000 -3.400000 92.201000 -50.800000 0.634000 4963.600000 0.250174
25% 32.00000 102.000000 1.000000 999.000000 0.000000 -1.800000 93.075000 -42.700000 1.344000 5099.100000 0.857484
50% 38.00000 180.000000 2.000000 999.000000 0.000000 1.100000 93.749000 -41.800000 4.857000 5191.000000 0.945879
75% 47.00000 319.000000 3.000000 999.000000 0.000000 1.400000 93.994000 -36.400000 4.961000 5228.100000 0.959214
max 98.00000 4918.000000 56.000000 999.000000 7.000000 1.400000 94.767000 -26.900000 5.045000 5228.100000 0.988360
In [10]:
# Confirm there are no missing values per column (isna is the modern alias of isnull):
data.isna().sum()
Out[10]:
age                0
job                0
marital            0
education          0
default            0
housing            0
loan               0
contact            0
month              0
day_of_week        0
duration           0
campaign           0
pdays              0
previous           0
poutcome           0
emp.var.rate       0
cons.price.idx     0
cons.conf.idx      0
euribor3m          0
nr.employed        0
ModelPrediction    0
y                  0
dtype: int64
In [11]:
# Frequency table for every categorical (object-dtype) column in the data.
category_column = data.select_dtypes(include='object').columns.tolist()

for column in category_column:
    print(column, '\n\n')
    print(data[column].value_counts())
    print("---" *20)
job 


admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64
------------------------------------------------------------
marital 


married     24928
single      11568
divorced     4612
unknown        80
Name: marital, dtype: int64
------------------------------------------------------------
education 


university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64
------------------------------------------------------------
default 


no         32588
unknown     8597
yes            3
Name: default, dtype: int64
------------------------------------------------------------
housing 


yes        21576
no         18622
unknown      990
Name: housing, dtype: int64
------------------------------------------------------------
loan 


no         33950
yes         6248
unknown      990
Name: loan, dtype: int64
------------------------------------------------------------
contact 


cellular     26144
telephone    15044
Name: contact, dtype: int64
------------------------------------------------------------
month 


may    13769
jul     7174
aug     6178
jun     5318
nov     4101
apr     2632
oct      718
sep      570
mar      546
dec      182
Name: month, dtype: int64
------------------------------------------------------------
day_of_week 


thu    8623
mon    8514
wed    8134
tue    8090
fri    7827
Name: day_of_week, dtype: int64
------------------------------------------------------------
poutcome 


nonexistent    35563
failure         4252
success         1373
Name: poutcome, dtype: int64
------------------------------------------------------------
y 


no     36548
yes     4640
Name: y, dtype: int64
------------------------------------------------------------
In [12]:
#Used the term 'target' as the variable, since that is our result
# (counts of "no" vs "yes" for the class variable y).
target_count = data['y'].value_counts()
target_count
Out[12]:
no     36548
yes     4640
Name: y, dtype: int64

Can infer from above analysis that the data is imbalanced, given that "Nos" are 88.73% of the dataset compared to the "Yes's".

Exploratory Data Analysis (EDA)- Data Visualization

In [13]:
#Analysing the column ('y') visually - Yeses and Nos

colors = ['Red', 'Green']
# Pie of the class balance; pull=[0.05] "explodes" the first (majority) wedge.
trace = pl.Pie(labels =target_count.index, values = target_count.values, pull= [0.05], marker=dict(colors=colors))

# Bug fix: the original set height=200 in the Layout and then immediately
# overrode it with update_layout(height=500, width=600) — set the final
# figure size once, in one place.
layout = pl.Layout(title = "Subscribed to the Term Deposit", height = 500, width = 600,
                   legend= dict(x=1.1, y=1.3))

fig = pl.Figure(data=[trace], layout = layout)
fig.show()

Visualization of the data based on data type of the columns

(Categorical or Numerical).

Bar charts to understand the distribution of values within the column

Countplots to understand the distribution of values with in the column

and corresponding to target column ( Subscribed or Not)

This univariate analysis is to understand, the column's distribution and insights on if any sub category within the column has a significance to subscribing or not.

In [14]:
# JOB — distribution of job categories among clients.
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: the positional-Series form of countplot is
# deprecated and removed in newer seaborn releases (>=0.12).
sns.countplot(x='job', data=data)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9550338898>

From the above graph, we can say that, in the data, clients with admin., blue-collar and technician category jobs are high in number compared to other job categories.

In [15]:
# Job counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='job', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40, size=15)
plt.tight_layout()
plt.show()

From above graph, we can infer that highest number of subscriptions are clients with admin job, next highest is technicians. But point here to be noted is that we also have a very high number of clients in that job category who did not subscribe. This is because simply there are more clients in our dataset who fall in that category of job.

In [16]:
# MARITAL — distribution of marital status.
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='marital', data=data)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f954ffb8fd0>

A simple insight is that the majority of this column's distribution is in the married category.

In [17]:
# Marital-status counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='marital', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()
In [18]:
#DEFAULT — does the client have credit in default?
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='default', data=data)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9552148860>

From the above graph, it looks like we barely have clients who have credit, major of them do not have credit.

In [19]:
# Default-status counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='default', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()
In [20]:
#EDUCATION — distribution of education levels.
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='education', data=data)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9552153630>
In [21]:
# Education counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='education', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

From above graphs on education, we can infer that, clients with university degree are a majority, and also highest number of subscribers are also clients with university degree.

In [22]:
# HOUSING — does the client have a housing loan?
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='housing', data=data)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9551e93e10>
In [23]:
# Housing-loan counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='housing', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

We can see that, majority of the clients have a housing loan. One direction we can go for further analysis is that we dig deeper in clients with any kind of loans and if they have subscribed or not.

In [24]:
#LOAN — does the client have a personal loan?
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='loan', data=data)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f954fc400b8>
In [25]:
# Personal-loan counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='loan', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

We observe that there are more clients without a personal loan who have subscribed. But we also see that a huge number of clients with no personal loan didn't subscribe to the deposit. So we can't really say that people with or without loans hold any significance in subscribing or not.

In [26]:
#CONTACT — contact communication type (cellular/telephone).
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='contact', data=data)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f95522fbf60>
In [27]:
# Contact-type counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='contact', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()
In [28]:
#MONTH — month of last contact.
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='month', data=data)
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9553441f98>

We can infer from plot above that, maximum number of clients were contacted in the month of May. ( Range can be set as May to August).

In [29]:
# Monthly contact counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='month', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

The first observation is that there has been very little activity in the month of December (understandable, since it's a holiday month). We can also say that very few clients subscribed in comparison to the number of people contacted in the summer months. As a rough analysis, March, April, September and October are somewhat more successful months in comparison to the others.

In [30]:
#DAY OF THE WEEK — weekday of last contact.
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='day_of_week', data=data)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f95535cb860>
In [31]:
# Weekday counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='day_of_week', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

At first look of the plot, we can infer that the number of clients who have subscribed is roughly the same throughout the week; irrespective of the day of contact, roughly 1000 clients have subscribed. So one understanding is that this column won't hold much significance in predicting, due to the lack of any meaningful difference between its sub-categories.

In [32]:
#POUTCOME — outcome of the previous marketing campaign.
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='poutcome', data=data)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9552bfff28>
In [33]:
# Previous-outcome counts split by subscription outcome (y).
plt.figure(figsize=(10, 5))
axes = sns.countplot(data=data, x='poutcome', hue='y', palette='Set1')
axes.set_xticklabels(axes.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

The analysis for this column is that a number of people who subscribed previously have subscribed for this new term as well, so it has some significance in predicting. One other insight is that the majority of the customers are new customers — this is derived from the bar showing a huge number of clients for whom the previous outcome is nonexistent.

In [34]:
# Age distribution split by the target (hue duplicates x here, purely cosmetic).
sns.boxplot(x='y', y='age', hue="y", data=data)
plt.tight_layout()

The boxplot above helps me understand that the median age of the customers in this dataset (targeted in the campaign) is between 35 and 40.

Both boxplots (Yes and No) have a similar spread, meaning age isn't playing a significant role in whether people have subscribed or not.

In [35]:
plt.figure(figsize=(10,8))
# sns.distplot is deprecated (removed in seaborn 0.14); histplot with a KDE
# overlay and density scaling is the documented replacement for the same visual.
sns.histplot(data["age"], kde=True, stat="density")
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9552f0cf98>

As we can see in the above distribution as well, most of the customers are in the age range of 30-50. The histogram helps us understand the column's distribution.

In [36]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="duration")
plt.show()

From the above plot it is clear that the duration (last contact duration) of a call can be useful for predicting the target variable. This is expected, because the data overview already mentions that this field highly affects the target variable and should only be used for benchmark purposes.

In [37]:
plt.figure(figsize=(10,8))
# distplot is deprecated (removed in seaborn 0.14); histplot(kde=True,
# stat="density") reproduces the same visual.
sns.histplot(data["duration"], kde=True, stat="density")
plt.show()

This looks like a power-law distribution, where most of the values are very low and very few are high.

In [38]:
# Two panels: common campaign counts (<=10 contacts) on top,
# the long tail (>=11 contacts, zoomed to y<=30) below.
plt.figure(figsize=(12, 10))

plt.subplot(2, 1, 1)
sns.countplot(x='campaign', hue='y', data=data, palette='Set1')
plt.xlim(right=10)
plt.xlabel('')

plt.subplot(2, 1, 2)
sns.countplot(x='campaign', hue='y', data=data, palette='Set1')
plt.xlim(left=11)
plt.ylim(top=30)
plt.xlabel('No of Campaigns', fontsize=14)
plt.show()

This attribute tells us the number of times the client was contacted in this campaign. It can be observed that the more times a client was contacted, the fewer the subscriptions.

In [39]:
# Campaign-contact counts split by the target.
sns.boxplot(x="y", y="campaign", data=data)
plt.show()
In [40]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data["campaign"])
plt.show()
In [41]:
# Distinct pdays values — 999 is a sentinel meaning "never previously contacted".
data['pdays'].unique()
Out[41]:
array([999,   6,   4,   3,   5,   1,   0,  10,   7,   8,   9,  11,   2,
        12,  13,  14,  15,  16,  21,  17,  18,  22,  25,  26,  19,  27,
        20])
In [42]:
# Frequency of each pdays value (the 999 sentinel dominates).
data['pdays'].value_counts()
Out[42]:
999    39673
3        439
6        412
4        118
9         64
2         61
7         60
12        58
10        52
5         46
13        36
11        28
1         26
15        24
14        20
8         18
0         15
16        11
17         8
18         7
19         3
22         3
21         2
26         1
20         1
25         1
27         1
Name: pdays, dtype: int64

Most of the values are 999, which means that most of the customers have never been contacted before.

In [43]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="pdays")
plt.show()
In [44]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data[data["y"]=="yes"]["pdays"])
sns.distplot(data[data["y"]=="no"]["pdays"])
plt.show()
In [45]:
data["previous"].unique()
Out[45]:
array([0, 1, 2, 3, 4, 5, 6, 7])
In [46]:
data["previous"].value_counts()
Out[46]:
0    35563
1     4561
2      754
3      216
4       70
5       18
6        5
7        1
Name: previous, dtype: int64
In [47]:
data[data["y"]=="yes"]["previous"].value_counts()
Out[47]:
0    3141
1     967
2     350
3     128
4      38
5      13
6       3
Name: previous, dtype: int64
In [48]:
data[data["y"]=="no"]["previous"].value_counts()
Out[48]:
0    32422
1     3594
2      404
3       88
4       32
5        5
6        2
7        1
Name: previous, dtype: int64
In [49]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="previous")
plt.show()
In [50]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data["previous"])
plt.show()
In [51]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data[data["y"]=="yes"]["previous"])
sns.distplot(data[data["y"]=="no"]["previous"])
plt.show()

The previous feature is very similarly distributed for both the classes in the target variable. From basic EDA it is not sure how much value this individual feature have on the target variable.

In [52]:
# PREVIOUS — contacts before this campaign.
# (The original comment said "Marital" — a copy-paste slip; this cell plots 'previous'.)
sns.set_style('whitegrid')
plt.figure(figsize=(14,7))
# Pass the column by name: positional-Series countplot is deprecated in seaborn.
sns.countplot(x='previous', data=data)
Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f95522962b0>
In [53]:
# 'previous' counts split by subscription outcome (y).
sns.countplot(data=data, x='previous', hue='y', palette='Set1')
plt.show()
In [54]:
data["emp.var.rate"].value_counts()
Out[54]:
 1.4    16234
-1.8     9184
 1.1     7763
-0.1     3683
-2.9     1663
-3.4     1071
-1.7      773
-1.1      635
-3.0      172
-0.2       10
Name: emp.var.rate, dtype: int64
In [55]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="emp.var.rate")
plt.show()
In [56]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data["emp.var.rate"])
plt.show()
In [57]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="cons.price.idx")
plt.show()
In [58]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data["cons.price.idx"])
plt.show()
In [59]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="cons.conf.idx")
plt.show()
In [60]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data["cons.conf.idx"])
plt.show()
In [61]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="euribor3m")
plt.show()
In [62]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data["euribor3m"])
plt.show()
In [63]:
%matplotlib inline
sns.boxplot(data=data, x="y", y="nr.employed")
plt.show()
In [64]:
%matplotlib inline
plt.figure(figsize=(10,8))
sns.distplot(data["nr.employed"])
plt.show()

Understanding the features well enough to do manual encoding ( changing few useful categorical columns to numeric)

Data Preprocessing for further modeling.

In [65]:
# Cleaning the data and making it ready for modeling.
In [66]:
# Importing the libraries required for machine learning and preprocessing the data:
import pandas as pd
import pickle
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
The first step in data cleaning is handling missing values, but as seen in the steps above (data.info()), there are no nulls in this dataset.
In [67]:
# ModelPrediction is the output of a prior model, not an input feature —
# drop it so it cannot leak into training.
data = data.drop('ModelPrediction', axis=1)
In [68]:
# Handling the duplicate data.

data_dups = data[data.duplicated(keep = "last")]
data_dups
Out[68]:
age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
1265 39 blue-collar married basic.6y no no no telephone may thu ... 1 999 0 nonexistent 1.1 93.994 -36.4 4.855 5191.0 no
12260 36 retired married unknown no no no telephone jul thu ... 1 999 0 nonexistent 1.4 93.918 -42.7 4.966 5228.1 no
14155 27 technician single professional.course no no no cellular jul mon ... 2 999 0 nonexistent 1.4 93.918 -42.7 4.962 5228.1 no
16819 47 technician divorced high.school no yes no cellular jul thu ... 3 999 0 nonexistent 1.4 93.918 -42.7 4.962 5228.1 no
18464 32 technician single professional.course no yes no cellular jul thu ... 1 999 0 nonexistent 1.4 93.918 -42.7 4.968 5228.1 no
20072 55 services married high.school unknown no no cellular aug mon ... 1 999 0 nonexistent 1.4 93.444 -36.1 4.965 5228.1 no
20531 41 technician married professional.course no yes no cellular aug tue ... 1 999 0 nonexistent 1.4 93.444 -36.1 4.966 5228.1 no
25183 39 admin. married university.degree no no no cellular nov tue ... 2 999 0 nonexistent -0.1 93.200 -42.0 4.153 5195.8 no
28476 24 services single high.school no yes no cellular apr tue ... 1 999 0 nonexistent -1.8 93.075 -47.1 1.423 5099.1 no
32505 35 admin. married university.degree no yes no cellular may fri ... 4 999 0 nonexistent -1.8 92.893 -46.2 1.313 5099.1 no
36950 45 admin. married university.degree no no no cellular jul thu ... 1 999 0 nonexistent -2.9 92.469 -33.6 1.072 5076.2 yes
38255 71 retired single university.degree no no no telephone oct tue ... 1 999 0 nonexistent -3.4 92.431 -26.9 0.742 5017.5 no

12 rows × 21 columns

In [69]:
#Getting the count of duplicated rows (rows, columns):
data_dups.shape
Out[69]:
(12, 21)
In [70]:
# We found 12 fully-duplicated rows above, so dropping the duplicates is advised
# (drop_duplicates keeps the first occurrence of each row by default).
data = data.drop_duplicates()
In [71]:
# Confirm the 12 duplicate rows were removed (41188 -> 41176).
data.shape
Out[71]:
(41176, 21)

Get the target (class variable) and independent features

In [72]:
# Independent features: every column except the last one (the target 'y').
data_x = data.drop(columns=['y'])
print("Shape of X: ", data_x.shape)
Shape of X:  (41176, 20)
In [73]:
# Target (class) variable: 'y' — subscribed to the term deposit or not.
data_y = data['y']
print('Shape of Y: ', data_y.shape)
Shape of Y:  (41176,)

Splitting the data into train and test sets

In [74]:
# Hold out 20% for test, then 20% of the remainder for cross-validation.
# random_state makes the split reproducible across kernel restarts; stratify
# preserves the ~89/11 no/yes class ratio in every split (the target is
# imbalanced, so an unstratified split can skew the rare class).
X_rest, X_test, y_rest, y_test = train_test_split(
    data_x, data_y, test_size=0.2, random_state=42, stratify=data_y)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=42, stratify=y_rest)
In [75]:
print("X Train:", X_train.shape)
print("X CV:", X_cv.shape)
print("X Test:", X_test.shape)
print("Y Train:", y_train.shape)
print("Y CV:", y_cv.shape)
print("Y Test:", y_test.shape)
X Train: (26352, 20)
X CV: (6588, 20)
X Test: (8236, 20)
Y Train: (26352,)
Y CV: (6588,)
Y Test: (8236,)
In [76]:
# Label encoding - map "no" -> 0 and "yes" -> 1.
# Reassigning via .map avoids in-place replace() on slices produced by
# train_test_split, which can trigger SettingWithCopyWarning and relies on
# hidden in-place mutation (only 'no'/'yes' occur, so no NaNs are produced).
y_train = y_train.map({'no': 0, 'yes': 1})
y_cv = y_cv.map({'no': 0, 'yes': 1})
y_test = y_test.map({'no': 0, 'yes': 1})

Encoding Categorical Features - Two common methods for encoding are 'One Hot Encoding' and 'Response Encoding'; I am using 'One Hot Encoding'.

In [77]:
# One big step before training any machine learning model is converting the categorical variables to numerical values.
# A few algorithms can handle label data natively, but most machine learning algorithms cannot operate on label data directly.
# They require all input variables and output variables to be numeric.
# This means that categorical data must be converted to a numerical form.
# We perform this method when no natural ordering is present in the categorical values.
In [78]:
#Categorical condition to extract categorical columns
# (boolean mask over columns: True where the dtype is object).
categorical_condn = data_x.dtypes == object
In [79]:
# Show the per-column mask: True = categorical (object dtype).
categorical_condn
Out[79]:
age               False
job                True
marital            True
education          True
default            True
housing            True
loan               True
contact            True
month              True
day_of_week        True
duration          False
campaign          False
pdays             False
previous          False
poutcome           True
emp.var.rate      False
cons.price.idx    False
cons.conf.idx     False
euribor3m         False
nr.employed       False
dtype: bool
In [80]:
# Select the names of the categorical columns as a plain Python list.
categorical_cols = list(data_x.columns[categorical_condn])
In [81]:
# The ten categorical columns that will be one-hot encoded.
categorical_cols
Out[81]:
['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']
In [82]:
from sklearn.feature_extraction.text import CountVectorizer

def add_onehot_to_dataframe(sparse, df, vectorizer, name):
    """Append one-hot columns from `sparse` to `df` in place.

    Each new column is named "<name>_<category>" using the vocabulary learned
    by `vectorizer`. Returns `df` (the same object) for chaining.
    """
    # get_feature_names() was removed in newer scikit-learn in favour of
    # get_feature_names_out(); support both.
    get_names = getattr(vectorizer, "get_feature_names_out",
                        getattr(vectorizer, "get_feature_names", None))
    for i, col in enumerate(get_names()):
        colname = name + "_" + col
        # .ravel() flattens the dense column; pandas accepts the ndarray
        # directly (the original's extra .tolist() was redundant).
        df[colname] = sparse[:, i].toarray().ravel()
    return df

def OneHotEncoder(categorical_cols, X_train, X_test, X_cv = None, include_cv = False):
    """One-hot encode `categorical_cols`, fitting on X_train only.

    Each vectorizer is fit on the training split and merely applied to the
    CV/test splits, so no category information leaks from evaluation data.
    NOTE: this name shadows sklearn.preprocessing.OneHotEncoder — kept
    unchanged for backward compatibility with existing calls.

    Bug fix: the original returned nothing and relied purely on hidden
    in-place mutation; we now also return (X_train, X_test, X_cv). Existing
    callers that ignore the return value keep working, since the frames are
    still modified in place.
    """
    for feature in categorical_cols:
        Vectorizer = CountVectorizer(token_pattern="[A-Za-z0-9-.]+")
        print("Encoding for feature: ", feature)
        temp_cols = Vectorizer.fit_transform(X_train[feature])
        X_train = add_onehot_to_dataframe(temp_cols, X_train, Vectorizer, feature)
        if include_cv:
            temp_cols = Vectorizer.transform(X_cv[feature])
            X_cv = add_onehot_to_dataframe(temp_cols, X_cv, Vectorizer, feature)

        temp_cols = Vectorizer.transform(X_test[feature])
        X_test = add_onehot_to_dataframe(temp_cols, X_test, Vectorizer, feature)

    return X_train, X_test, X_cv
        
In [83]:
# Fit the vectorizers on the training split and apply them to CV and test;
# the frames are modified in place by the helper.
OneHotEncoder(categorical_cols, X_train, X_test, X_cv, True)
Encoding for feature:  job
/Users/aishwaryamaddimsetty/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Encoding for feature:  marital
Encoding for feature:  education
Encoding for feature:  default
Encoding for feature:  housing
Encoding for feature:  loan
Encoding for feature:  contact
Encoding for feature:  month
Encoding for feature:  day_of_week
Encoding for feature:  poutcome
In [84]:
# The raw categorical columns are now redundant (their encoded versions exist),
# so remove them from all three splits.
X_train = X_train.drop(columns=categorical_cols)
X_cv = X_cv.drop(columns=categorical_cols)
X_test = X_test.drop(columns=categorical_cols)

print("Shape of train:", X_train.shape)
print("Shape of CV:", X_cv.shape)
print("Shape of test:", X_test.shape)
Shape of train: (26352, 63)
Shape of CV: (6588, 63)
Shape of test: (8236, 63)
In [85]:
# Verify every column is now numeric (int64/float64) after encoding.
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26352 entries, 35894 to 16559
Data columns (total 63 columns):
age                              26352 non-null int64
duration                         26352 non-null int64
campaign                         26352 non-null int64
pdays                            26352 non-null int64
previous                         26352 non-null int64
emp.var.rate                     26352 non-null float64
cons.price.idx                   26352 non-null float64
cons.conf.idx                    26352 non-null float64
euribor3m                        26352 non-null float64
nr.employed                      26352 non-null float64
job_admin.                       26352 non-null int64
job_blue-collar                  26352 non-null int64
job_entrepreneur                 26352 non-null int64
job_housemaid                    26352 non-null int64
job_management                   26352 non-null int64
job_retired                      26352 non-null int64
job_self-employed                26352 non-null int64
job_services                     26352 non-null int64
job_student                      26352 non-null int64
job_technician                   26352 non-null int64
job_unemployed                   26352 non-null int64
job_unknown                      26352 non-null int64
marital_divorced                 26352 non-null int64
marital_married                  26352 non-null int64
marital_single                   26352 non-null int64
marital_unknown                  26352 non-null int64
education_basic.4y               26352 non-null int64
education_basic.6y               26352 non-null int64
education_basic.9y               26352 non-null int64
education_high.school            26352 non-null int64
education_illiterate             26352 non-null int64
education_professional.course    26352 non-null int64
education_university.degree      26352 non-null int64
education_unknown                26352 non-null int64
default_no                       26352 non-null int64
default_unknown                  26352 non-null int64
default_yes                      26352 non-null int64
housing_no                       26352 non-null int64
housing_unknown                  26352 non-null int64
housing_yes                      26352 non-null int64
loan_no                          26352 non-null int64
loan_unknown                     26352 non-null int64
loan_yes                         26352 non-null int64
contact_cellular                 26352 non-null int64
contact_telephone                26352 non-null int64
month_apr                        26352 non-null int64
month_aug                        26352 non-null int64
month_dec                        26352 non-null int64
month_jul                        26352 non-null int64
month_jun                        26352 non-null int64
month_mar                        26352 non-null int64
month_may                        26352 non-null int64
month_nov                        26352 non-null int64
month_oct                        26352 non-null int64
month_sep                        26352 non-null int64
day_of_week_fri                  26352 non-null int64
day_of_week_mon                  26352 non-null int64
day_of_week_thu                  26352 non-null int64
day_of_week_tue                  26352 non-null int64
day_of_week_wed                  26352 non-null int64
poutcome_failure                 26352 non-null int64
poutcome_nonexistent             26352 non-null int64
poutcome_success                 26352 non-null int64
dtypes: float64(5), int64(58)
memory usage: 12.9 MB
In [86]:
# Preview the first rows of the training features to confirm that every
# column is now numeric (the categoricals were one-hot encoded above).
X_train.head(5)
Out[86]:
age duration campaign pdays previous emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed ... month_oct month_sep day_of_week_fri day_of_week_mon day_of_week_thu day_of_week_tue day_of_week_wed poutcome_failure poutcome_nonexistent poutcome_success
35894 59 249 3 999 0 -1.8 92.893 -46.2 1.259 5099.1 ... 0 0 1 0 0 0 0 0 1 0
30954 46 159 2 999 1 -1.8 92.893 -46.2 1.344 5099.1 ... 0 0 0 0 0 1 0 1 0 0
7673 50 66 4 999 0 1.1 93.994 -36.4 4.864 5191.0 ... 0 0 1 0 0 0 0 0 1 0
7530 34 83 2 999 0 1.1 93.994 -36.4 4.864 5191.0 ... 0 0 1 0 0 0 0 0 1 0
35335 32 65 5 999 0 -1.8 92.893 -46.2 1.250 5099.1 ... 0 0 1 0 0 0 0 0 1 0

5 rows × 63 columns

In [87]:
# Lift pandas' column-display truncation so all 63 encoded columns are visible.
# NOTE: this option change is global and persists for the rest of the session.
pd.set_option('display.max_columns', None)
X_train.head(5)
Out[87]:
age duration campaign pdays previous emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed job_admin. job_blue-collar job_entrepreneur job_housemaid job_management job_retired job_self-employed job_services job_student job_technician job_unemployed job_unknown marital_divorced marital_married marital_single marital_unknown education_basic.4y education_basic.6y education_basic.9y education_high.school education_illiterate education_professional.course education_university.degree education_unknown default_no default_unknown default_yes housing_no housing_unknown housing_yes loan_no loan_unknown loan_yes contact_cellular contact_telephone month_apr month_aug month_dec month_jul month_jun month_mar month_may month_nov month_oct month_sep day_of_week_fri day_of_week_mon day_of_week_thu day_of_week_tue day_of_week_wed poutcome_failure poutcome_nonexistent poutcome_success
35894 59 249 3 999 0 -1.8 92.893 -46.2 1.259 5099.1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0
30954 46 159 2 999 1 -1.8 92.893 -46.2 1.344 5099.1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0
7673 50 66 4 999 0 1.1 93.994 -36.4 4.864 5191.0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0
7530 34 83 2 999 0 1.1 93.994 -36.4 4.864 5191.0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0
35335 32 65 5 999 0 -1.8 92.893 -46.2 1.250 5099.1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0
In [88]:
# Best practice: persist the encoded dataset locally (also handy for quick
# Excel filtering if needed). index=False keeps the RangeIndex out of the CSV;
# otherwise it round-trips back as a spurious 'Unnamed: 0' column on re-load.
data_x.to_csv("encoded_data_x.csv", index=False)
In [89]:
# Advice before building any model on this dataset: drop the 'duration' column, due to its high correlation with the target variable.

The above step concludes the data processing and prepares the dataset for machine-learning model training.

In [90]:
# Creating a simple machine learning model - I've picked logistic regression here because,
# in my view, it is the textbook fit for our dataset:
# we have a binary classification problem, hence a logistic regression model.
In [91]:
# Building the model with the 'duration' column still included.
In [92]:
# Fit a class-weighted logistic regression (compensates for the heavy
# 'no'/'yes' imbalance) and evaluate ranking quality with ROC AUC on the
# held-out test set.
model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# Per-row class probabilities; column 1 is the positive ('yes') class.
y_pred = model.predict_proba(X_test)

auc = roc_auc_score(y_test, y_pred[:, 1])
print("AUC score:", auc)
/Users/aishwaryamaddimsetty/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:

Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

AUC score: 0.9370765314811211
In [93]:
print(y_pred)
[[0.67095133 0.32904867]
 [0.9630996  0.0369004 ]
 [0.9668809  0.0331191 ]
 ...
 [0.97943918 0.02056082]
 [0.05993991 0.94006009]
 [0.90638602 0.09361398]]
In [94]:
Y_ProbList =list(y_pred)
In [95]:
# To be run: mean training-set accuracy of the class-weighted model
# (note: accuracy is misleading under class imbalance — AUC above is the
# more informative metric for this dataset).
model.score(X_train,y_train)
Out[95]:
0.8607695810564663
In [ ]:
 

Building Logistic Regression model

In [86]:
data.shape
Out[86]:
(41188, 22)
In [87]:
data_new = data.drop(columns = ['ModelPrediction'])
In [88]:
data_new.shape
Out[88]:
(41188, 21)
In [89]:
# Binary-encode the 'contact' channel: cellular -> 0, telephone -> 1.
# NOTE: .map leaves NaN for any value outside these two keys.
contact = {'cellular': 0, 'telephone': 1}
data_new['contact'] = data_new['contact'].map(contact)
In [90]:
# One-hot encode the remaining categorical features. drop_first=True removes
# one level per feature to avoid perfect multicollinearity (dummy-variable trap).
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'month', 'day_of_week', 'poutcome']
data_new = pd.get_dummies(data_new, columns=categorical_cols, drop_first=True)
In [91]:
data_new.shape
Out[91]:
(41188, 54)
In [92]:
# Importing the required scikit-learn libraries
In [97]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
print('.....Setup complete')
import os
.....Setup complete
In [94]:
# Separate the features (X) from the label (y), then hold out 30% of the
# rows as a test set. random_state pins the shuffle for reproducibility.
X = data_new.drop(columns=['y'])
y = data_new[['y']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
In [95]:
# Print the length (row count) of both the train and test sets.
print(f'shape of X_train :  {len(X_train)} \nshape of y_train :  {len(y_train)}')
print(f'\nshape of X_test  :  {len(X_test)} \nshape of y_test  :  {len(y_test)}')
shape of X_train :  28831 
shape of y_train :  28831

shape of X_test  :  12357 
shape of y_test  :  12357
In [98]:
# Standardize the features (zero mean, unit variance). The scaler is fitted
# on the training set only and then applied to the test set, which avoids
# leaking test-set statistics into training. NOTE: StandardScaler returns
# numpy arrays, so X_train/X_test lose their DataFrame column labels here.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
In [99]:
# Fit a (default, unweighted) logistic-regression classifier on the scaled
# training data; the cell's last expression displays the training accuracy.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X_train, y_train)
clf.score(X_train, y_train)
Out[99]:
0.9107904685928341
In [100]:
clf.score(X_test,y_test)
Out[100]:
0.9121145909201263
In [101]:
# Predict hard class labels ('no'/'yes') for the held-out test set.
y_pred = clf.predict(X_test)
In [102]:
# Evaluate the model with a confusion matrix.
# sklearn convention: rows = actual class, columns = predicted class,
# in sorted label order (['no', 'yes'] here).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
cm
Out[102]:
array([[10669,   300],
       [  786,   602]])
In [103]:
# Per-class precision/recall/F1. Note the much weaker recall on the minority
# 'yes' class — this model was trained without class_weight='balanced',
# unlike the earlier one, so it favors the majority 'no' class.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

          no       0.93      0.97      0.95     10969
         yes       0.67      0.43      0.53      1388

   micro avg       0.91      0.91      0.91     12357
   macro avg       0.80      0.70      0.74     12357
weighted avg       0.90      0.91      0.90     12357

In [ ]: